datadir = "/Volumes/test_data"
sampleList = c("Diagenode_C15410196_1_50", "CST9733_1_100_H3K27me3")
alignResult = c()
for(sample in sampleList){
alignRes = read.table(paste0(datadir, "/alignment/sam/bowtie2_summary/", sample, "_bowtie2.txt"), header = FALSE, fill = TRUE)
alignRate = substr(alignRes$V1[6], 1, nchar(as.character(alignRes$V1[6]))-1)
alignResult = data.frame(Sample = sample, SequencingDepth = alignRes$V1[1] %>% as.character %>% as.numeric,
MappedFragNum_hg19 = alignRes$V1[4] %>% as.character %>% as.numeric + alignRes$V1[5] %>% as.character %>% as.numeric,
AlignmentRate_hg19 = alignRate %>% as.numeric) %>% rbind(alignResult, .)
}
alignResult %>% mutate(AlignmentRate_hg19 = paste0(AlignmentRate_hg19, "%"))dupResult = c()
for(sample in sampleList){
dupRes = read.table(paste0(datadir, "/removeDuplicate/picard_summary/", sample, "_picard.rmDup.txt"), header = TRUE, fill = TRUE)
dupResult = data.frame(Sample = sample, MappedFragNum_hg19 = dupRes$READ_PAIRS_EXAMINED[1] %>% as.character %>% as.numeric, DuplicationRate = dupRes$PERCENT_DUPLICATION[1] %>% as.character %>% as.numeric * 100) %>% mutate(UniqueFragNum = (MappedFragNum_hg19 * (1-DuplicationRate/100)) %>% round()) %>% rbind(dupResult, .)
}
alignDupSummary = left_join(alignResult, dupResult, by = c("Sample", "MappedFragNum_hg19")) %>% mutate(AlignmentRate_hg19 = paste0(AlignmentRate_hg19, "%"))
alignDupSummaryfragLen = c()
for(sample in sampleList){
fragLen = read.table(paste0(datadir, "/alignment/sam/fragmentLen/", sample, "_rmDup_fragmentLen.txt"), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Sample = sample) %>% rbind(fragLen, .)
}
fig1 = fragLen %>% ggplot(aes(x = Sample, y = fragLen, weight = Weight, fill = Sample)) +
theme_bw(base_size = 15) +
geom_violin(bw = 5) +
scale_y_continuous(breaks = seq(0, 800, 50)) +
ggpubr::rotate_x_text(angle = 20) +
ylab("Fragment Length") +
xlab("") +
theme(legend.position = "bottom")
fig2 = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Sample, group = Sample)) +
theme_bw(base_size = 15) +
geom_line(size = 1) +
xlab("Fragment Length") +
ylab("Count") +
coord_cartesian(xlim = c(0, 500))
ggarrange(fig1, fig2, ncol = 2)fragLen = c()
for(sample in sampleList){
fragLen = read.table(paste0(datadir, "/alignment/sam/fragmentLen/", sample, "_withDup_fragmentLen.txt"), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Sample = sample) %>% rbind(fragLen, .)
}
fig3 = fragLen %>% ggplot(aes(x = Sample, y = fragLen, weight = Weight, fill = Sample)) +
theme_bw(base_size = 15) +
geom_violin(bw = 5) +
scale_y_continuous(breaks = seq(0, 800, 50)) +
ggpubr::rotate_x_text(angle = 20) +
ylab("Fragment Length") +
xlab("") +
theme(legend.position = "bottom")
fig4 = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Sample, group = Sample)) +
theme_bw(base_size = 15) +
geom_line(size = 1) +
xlab("Fragment Length") +
ylab("Count") +
coord_cartesian(xlim = c(0, 500))
ggarrange(fig3, fig4, ncol = 2)# peak names simplified and all files moved to one directory
sampleList_all = c()
peak_caller = c("SEACR", "MACS2")
for(sample in sampleList){
for(caller in peak_caller){
name = paste0(sample, "_", caller)
sampleList_all = c(sampleList_all, name)
}
}
hg19_blacklist = ChIPseeker::readPeakFile(paste0(datadir, "/resources/hg19/hg19-blacklist.v2.bed.gz"), as = "GRanges")
blacklist_counts = c()
peakList = list()
peakList_unfilt = list()
for(sample in sampleList_all){
peak_unfilt = ChIPseeker::readPeakFile(paste0(datadir, "/peakCalling/all_peaks/", sample, ".bed"), as = "GRanges")
peakList_unfilt = c(peakList_unfilt, peak_unfilt)
blacklist_count = length(IRanges::subsetByOverlaps(peak_unfilt, hg19_blacklist))
peak_filt = IRanges::subsetByOverlaps(peak_unfilt, hg19_blacklist, invert = TRUE) %>% BRGenomics::tidyChromosomes(keep.X = TRUE, keep.Y = TRUE)
peakList = c(peakList, peak_filt)
blacklist_counts = c(blacklist_counts, blacklist_count)
}
names(peakList) = sampleList_all
names(peakList_unfilt) = sampleList_allsampleList_all_withDup = c()
for(sample in sampleList_all){
name = paste0(sample, "_", "withDup")
sampleList_all_withDup = c(sampleList_all_withDup, name)
}
peakList_withDup = list()
peakList_withDup_unfilt = list()
for(sample in sampleList_all_withDup){
peak_unfilt = ChIPseeker::readPeakFile(paste0(datadir, "/peakCalling/all_peaks/", sample, ".bed"), as = "GRanges")
peakList_withDup_unfilt = c(peakList_withDup_unfilt, peak_unfilt)
peak_filt = IRanges::subsetByOverlaps(peak_unfilt, hg19_blacklist, invert = TRUE) %>% BRGenomics::tidyChromosomes(keep.X = TRUE, keep.Y = TRUE)
peakList_withDup = c(peakList_withDup, peak_filt)
}
names(peakList_withDup) = sampleList_all_withDup
names(peakList_withDup_unfilt) = sampleList_all_withDupENCODE_H3K27ac = ChIPseeker::readPeakFile(paste0(datadir, "/resources/ENCODE/peaks/H3K27ac_ENCFF044JNJ.bed.narrowPeak"), as = "GRanges")
ENCODE_H3K27me3 = ChIPseeker::readPeakFile(paste0(datadir, "/resources/ENCODE/peaks/H3K27me3_ENCFF000BXB.bed.broadPeak"), as = "GRanges")
peakList_with_ENCODE = c(peakList, ENCODE_H3K27ac, ENCODE_H3K27me3)
sampleList_with_ENCODE = c(sampleList, "ENCODE_H3K27ac", "ENCODE_H3K27me3")
sampleList_all_with_ENCODE = c(sampleList_all, "ENCODE_H3K27ac", "ENCODE_H3K27me3")
peakList_with_ENCODE = setNames(peakList_with_ENCODE, sampleList_all_with_ENCODE)Total_CT_peaks = c()
Total_CT_peaks_withDup = c()
for(i in 1:length(peakList)){
Total_CT_peaks = c(Total_CT_peaks, length(peakList[[i]]))
Total_CT_peaks_withDup = c(Total_CT_peaks_withDup, length(peakList_withDup[[i]]))
}
Total_CT_peaks = data.frame(Sample = sampleList_all, Total_PeakN = Total_CT_peaks, Total_PeakN_withDup = Total_CT_peaks_withDup)
Total_CT_peaksblacklisted = data.frame(Sample = sampleList_all, Total_peaks = Total_CT_peaks$Total_PeakN, Blacklisted_peaks = blacklist_counts)
blacklisted$Proportion = blacklisted$Blacklisted_peaks / blacklisted$Total_peaks * 100
blacklistedpeakWidth = c()
for(peak in peakList){
peakW = GenomicRanges::width(peak) %>% summary() %>% round() %>% list()
peakWidth = c(peakWidth, peakW)
}
WSummary = c()
for(i in 1:length(peakWidth)){
WSummary = data.frame(Sample = sampleList_all[i], MinW = array(peakWidth[[i]][1]), MedianW = array(peakWidth[[i]][3]), MeanW = array(peakWidth[[i]][4]), MaxW = array(peakWidth[[i]][6])) %>% rbind(., WSummary)
}
WSummarycalculate_frips = function(sample_list, peak_list, dup){
inPeakData_df = c()
for(i in 1:length(sample_list)){
# to use single peak files
if(length(peak_list) > 1){
peakRes = data.frame(peak_list[[i]])
} else {
peakRes = data.frame(peak_list[[1]])
}
sample = sample_list[i]
peak.gr = GenomicRanges::GRanges(seqnames = peakRes$seqnames, IRanges(start = peakRes$start, end = peakRes$end), strand = "*")
bamFile = paste0(datadir, "/alignment/bam/", sample, "_", dup, "_bowtie2.mapped.bam")
fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = TRUE, by_rg = FALSE, format = "bam")
inPeakN = counts(fragment_counts)[,1] %>% sum
inPeakData_df = rbind(inPeakData_df, data.frame(Sample = sample, inPeakN = inPeakN))
}
return(inPeakData_df)
}# use original peaks (unfiltered)
SEACR_peakList_unfilt = peakList_unfilt[seq(1, length(peakList_unfilt), by = 2)]
MACS2_peakList_unfilt = peakList_unfilt[seq(2, length(peakList_unfilt), by = 2)]
SEACR_peakList_unfilt_withDup = peakList_withDup_unfilt[seq(1, length(peakList_withDup_unfilt), by = 2)]
MACS2_peakList_unfilt_withDup = peakList_withDup_unfilt[seq(1, length(peakList_withDup_unfilt), by = 2)]SEACR_noDup_inPeakData = calculate_frips(sampleList, SEACR_peakList_unfilt, "rmDup")
write.table(SEACR_noDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/SEACR_noDup_inPeakData_2.csv"), sep = ",", col.names = TRUE, row.names = FALSE)SEACR_withDup_inPeakData = calculate_frips(sampleList, SEACR_peakList_unfilt_withDup, "withDup")
#write.table(SEACR_withDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/SEACR_withDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)MACS2_noDup_inPeakData = calculate_frips(sampleList, MACS2_peakList_unfilt, "rmDup")
#write.table(MACS2_noDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/MACS2_noDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)MACS2_withDup_inPeakData = calculate_frips(sampleList, MACS2_peakList_unfilt_withDup, "withDup")
#write.table(MACS2_withDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/MACS2_withDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)ENCODE_H3K27ac_noDup_inPeakData = calculate_frips(sampleList, list(ENCODE_H3K27ac), "rmDup")
#write.table(ENCODE_H3K27ac_noDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27ac_noDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)ENCODE_H3K27ac_withDup_inPeakData = calculate_frips(sampleList, list(ENCODE_H3K27ac), "withDup")
#write.table(ENCODE_H3K27ac_withDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27ac_withDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)ENCODE_H3K27me3_noDup_inPeakData = calculate_frips(sampleList, list(ENCODE_H3K27me3), "rmDup")
#write.table(ENCODE_H3K27me3_noDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27me3_noDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)ENCODE_H3K27me3_withDup_inPeakData = calculate_frips(sampleList, list(ENCODE_H3K27me3), "withDup")
#write.table(ENCODE_H3K27me3_withDup_inPeakData, file = paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27me3_withDup_inPeakData.csv"), sep = ",", col.names = TRUE, row.names = FALSE)# save and load counts to speed up processing
SEACR_noDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/SEACR_noDup_inPeakData.csv"))
SEACR_withDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/SEACR_withDup_inPeakData.csv"))
MACS2_noDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/MACS2_noDup_inPeakData.csv"))
MACS2_withDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/MACS2_withDup_inPeakData.csv"))
ENCODE_H3K27ac_noDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27ac_noDup_inPeakData.csv"))
ENCODE_H3K27ac_withDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27ac_withDup_inPeakData.csv"))
ENCODE_H3K27me3_noDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27me3_noDup_inPeakData.csv"))
ENCODE_H3K27me3_withDup_inPeakData = read.csv(paste0(datadir, "/analysis/results/FRiPs/ENCODE_H3K27me3_withDup_inPeakData.csv"))FRiP_data = alignDupSummary[c("Sample", "MappedFragNum_hg19", "UniqueFragNum")] %>%
left_join(SEACR_noDup_inPeakData, by = "Sample") %>%
left_join(SEACR_withDup_inPeakData, by = "Sample") %>%
left_join(MACS2_noDup_inPeakData, by = "Sample") %>%
left_join(MACS2_withDup_inPeakData, by = "Sample") %>%
left_join(ENCODE_H3K27ac_noDup_inPeakData, by = "Sample") %>%
left_join(ENCODE_H3K27ac_withDup_inPeakData, by = "Sample") %>%
left_join(ENCODE_H3K27me3_noDup_inPeakData, by = "Sample") %>%
left_join(ENCODE_H3K27me3_withDup_inPeakData, by = "Sample")
colnames(FRiP_data) = c("Sample", "MappedFragNum_hg19", "UniqueFragNum", "SEACR_noDup_inPeakN", "SEACR_withDup_inPeakN", "MACS2_noDup_inPeakN", "MACS2_withDup_inPeakN", "ENCODE_H3K27ac_noDup_inPeakN", "ENCODE_H3K27ac_withDup_inPeakN", "ENCODE_H3K27me3_noDup_inPeakN", "ENCODE_H3K27me3_withDup_inPeakN")
FRiP_data$SEACR_noDup_FRiP = FRiP_data$SEACR_noDup_inPeakN/FRiP_data$UniqueFragNum * 100
FRiP_data$SEACR_withDup_FRiP = FRiP_data$SEACR_withDup_inPeakN/FRiP_data$MappedFragNum_hg19 * 100
FRiP_data$MACS2_noDup_FRiP = FRiP_data$MACS2_noDup_inPeakN/FRiP_data$UniqueFragNum * 100
FRiP_data$MACS2_withDup_FRiP = FRiP_data$MACS2_withDup_inPeakN/FRiP_data$MappedFragNum_hg19 * 100
FRiP_data$ENCODE_H3K27ac_noDup_FRiP = FRiP_data$ENCODE_H3K27ac_noDup_inPeakN/FRiP_data$UniqueFragNum * 100
FRiP_data$ENCODE_H3K27ac_withDup_FRiP = FRiP_data$ENCODE_H3K27ac_withDup_inPeakN/FRiP_data$MappedFragNum_hg19 * 100
FRiP_data$ENCODE_H3K27me3_noDup_FRiP = FRiP_data$ENCODE_H3K27me3_noDup_inPeakN/FRiP_data$UniqueFragNum * 100
FRiP_data$ENCODE_H3K27me3_withDup_FRiP = FRiP_data$ENCODE_H3K27me3_withDup_inPeakN/FRiP_data$MappedFragNum_hg19 * 100
FRiP_dataFRiP_summary = FRiP_data[c("Sample", "SEACR_noDup_FRiP", "SEACR_withDup_FRiP", "MACS2_noDup_FRiP", "MACS2_withDup_FRiP", "ENCODE_H3K27ac_noDup_FRiP", "ENCODE_H3K27ac_withDup_FRiP", "ENCODE_H3K27me3_noDup_FRiP", "ENCODE_H3K27me3_withDup_FRiP")]
FRiP_summary_melt = melt(FRiP_summary, id.vars = c("Sample"))
FRiP_plot = ggplot(FRiP_summary_melt, aes(Sample, value)) +
theme_bw(base_size = 20) +
geom_bar(aes(fill = variable), width = 0.6, position = position_dodge(width = 0.8), stat = "identity", colour = "black") +
ylim(0, 100) +
ylab("% of fragments in peaks") +
xlab("Sample") +
ggtitle("Fragments in sample and ENCODE peaks") +
theme(legend.title = element_blank()) +
ggpubr::rotate_x_text(angle = 45) +
theme(legend.position = "bottom") +
scale_fill_manual(labels = c("SEACR noDup", "SEACR withDup", "MACS2 noDup", "MACS2 withDup", "ENCODE H3K27ac noDup", "ENCODE H3K27ac withDup", "ENCODE H3K27me3 noDup", "ENCODE H3K27me3 withDup"), values = c("violetred", "violet", "firebrick2", "orange1", "springgreen4", "chartreuse2", "royalblue2", "cyan3"))
FRiP_plotCumulative read enrichment across 1000bp genomic bins, scaled to 2M reads.
Fingerprint
counts = read.csv(paste0(datadir, "/correlation/hg19_500bp_overlaps_counts.txt"), sep = "\t", header = FALSE)
colnames(counts) = c("chr", "start", "end", sampleList_with_ENCODE)# same order as bam list
counts_corr = counts[4:ncol(counts)] %>% as.matrix() %>% preprocessCore::normalize.quantiles() %>% round() %>% cor(method = "pearson")
colnames(counts_corr) = sampleList_with_ENCODE
rownames(counts_corr) = sampleList_with_ENCODE
counts_corr[counts_corr <= 0] = 0counts_rounded = counts_corr %>% round(2)
heatmap.2(x = counts_rounded, col = "viridis", trace = "none", density.info = "none", cexRow = 1.5, cexCol = 1.5, notecol = "black", lhei = c(2, 11), lwid = c(2, 8), margins = c(20, 20))counts = read.csv(paste0(datadir, "/correlation/H3K27ac_ENCFF044JNJ_overlaps_counts.txt"), sep = "\t", header = FALSE)
colnames(counts) = c("chr", "start", "end", "name", "score", "strand", "signalValue", "pValue", "qValue", "peak", sampleList_with_ENCODE)# same order as bam list
counts_corr = counts[11:ncol(counts)] %>% as.matrix() %>% preprocessCore::normalize.quantiles() %>% round() %>% cor(method = "pearson")
colnames(counts_corr) = sampleList_with_ENCODE
rownames(counts_corr) = sampleList_with_ENCODE
counts_corr[counts_corr <= 0] = 0counts_rounded = counts_corr %>% round(2)
heatmap.2(x = counts_rounded, col = "viridis", trace = "none", density.info = "none", cexRow = 1.5, cexCol = 1.5, notecol = "black", lhei = c(2, 11), lwid = c(2, 8), margins = c(20, 20))DBA_mixed = NULL
for(i in 1:length(sampleList_all_with_ENCODE)){
DBA_mixed = DiffBind::dba.peakset(DBA = DBA_mixed, peaks = peakList_with_ENCODE[[i]], sampID = sampleList_all_with_ENCODE[i], peak.format = "bed")
}DiffBind::dba.plotHeatmap(DBA_mixed, lhei = c(1,5), lwid = c(1,5), keysize = 0.75, cexRow = 1.25, cexCol = 1.25, margin = 20)count_overlaps = function(peak_list, reference){
sample_peaks_in_ref = c()
for(i in 1:length(peak_list)){
overlaps_count = GenomicRanges::countOverlaps(query = peak_list[[i]], subject = reference)
sample_in_reference = overlaps_count[overlaps_count != 0] %>% length()
sample_peaks_in_ref = c(sample_peaks_in_ref, sample_in_reference)
}
return(sample_peaks_in_ref)
}
ENCODE_H3K27ac_overlapping_CT_peaks = count_overlaps(peakList, ENCODE_H3K27ac)
ENCODE_H3K27me3_overlapping_CT_peaks = count_overlaps(peakList, ENCODE_H3K27ac)
ENCODE_H3K27ac_overlapping_CT_peaks_withDup = count_overlaps(peakList_withDup, ENCODE_H3K27me3)
ENCODE_H3K27me3_overlapping_CT_peaks_withDup = count_overlaps(peakList_withDup, ENCODE_H3K27me3)
CT_in_ENCODE = data.frame(Sample = sampleList_all, CT_peaks_in_ENCODE_H3K27ac = ENCODE_H3K27ac_overlapping_CT_peaks, CT_peaks_in_ENCODE_H3K27me3 = ENCODE_H3K27me3_overlapping_CT_peaks, CT_peaks_in_ENCODE_H3K27ac_withDup = ENCODE_H3K27ac_overlapping_CT_peaks_withDup, CT_peaks_in_ENCODE_H3K27me3_withDup = ENCODE_H3K27me3_overlapping_CT_peaks_withDup)
CT_in_ENCODECollect C&T peaks in ENCODE:
CT_peaks_in_ENCODE_H3K27ac_set = c()
CT_peaks_in_ENCODE_H3K27me3_set = c()
for(i in 1:length(peakList)){
in_ENCODE_H3K27ac = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27ac)
in_ENCODE_H3K27me3 = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27me3)
CT_peaks_in_ENCODE_H3K27ac_set = c(CT_peaks_in_ENCODE_H3K27ac_set, in_ENCODE_H3K27ac)
CT_peaks_in_ENCODE_H3K27me3_set = c(CT_peaks_in_ENCODE_H3K27me3_set, in_ENCODE_H3K27me3)
}
CT_peaks_in_ENCODE_H3K27ac_set = setNames(CT_peaks_in_ENCODE_H3K27ac_set, sampleList_all)
CT_peaks_in_ENCODE_H3K27me3_set = setNames(CT_peaks_in_ENCODE_H3K27me3_set, sampleList_all)Collect C&T peaks not in ENCODE:
CT_peaks_not_in_ENCODE_H3K27ac_set = c()
CT_peaks_not_in_ENCODE_H3K27me3_set = c()
for(i in 1:length(peakList)){
not_in_ENCODE_H3K27ac = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27ac, invert = TRUE)
not_in_ENCODE_H3K27me3 = IRanges::subsetByOverlaps(peakList[[i]], ENCODE_H3K27me3, invert = TRUE)
CT_peaks_not_in_ENCODE_H3K27ac_set = c(CT_peaks_not_in_ENCODE_H3K27ac_set, not_in_ENCODE_H3K27ac)
CT_peaks_not_in_ENCODE_H3K27me3_set = c(CT_peaks_not_in_ENCODE_H3K27me3_set, not_in_ENCODE_H3K27me3)
}
CT_peaks_not_in_ENCODE_H3K27ac_set = setNames(CT_peaks_not_in_ENCODE_H3K27ac_set, sampleList_all)
CT_peaks_not_in_ENCODE_H3K27me3_set = setNames(CT_peaks_not_in_ENCODE_H3K27me3_set, sampleList_all)count_overlaps_rev = function(peak_list, reference){
ref_peaks_in_sample = c()
for(i in 1:length(peak_list)){
overlaps_count = GenomicRanges::countOverlaps(query = reference, subject = peak_list[[i]])
reference_in_sample = overlaps_count[overlaps_count != 0] %>% length()
ref_peaks_in_sample = c(ref_peaks_in_sample, reference_in_sample)
}
return(ref_peaks_in_sample)
}
captured_H3K27ac_ENCODE_list = count_overlaps_rev(reference = ENCODE_H3K27ac, peak_list = peakList)
captured_H3K27me3_ENCODE_list = count_overlaps_rev(reference = ENCODE_H3K27me3, peak_list = peakList)
captured_H3K27ac_ENCODE_list_withDup = count_overlaps_rev(reference = ENCODE_H3K27ac, peak_list = peakList_withDup)
captured_H3K27me3_ENCODE_list_withDup = count_overlaps_rev(reference = ENCODE_H3K27me3, peakList_withDup)
Total_captured_ENCODE_peaks = data.frame(Sample = sampleList_all, ENCODE_H3K27ac_captured_peaks = captured_H3K27ac_ENCODE_list, ENCODE_H3K27me3_captured_peaks = captured_H3K27me3_ENCODE_list, ENCODE_H3K27ac_captured_peaks_withDup = captured_H3K27ac_ENCODE_list_withDup, ENCODE_H3K27me3_captured_peaks_withDup = captured_H3K27me3_ENCODE_list_withDup)
Total_captured_ENCODE_peaksCollect ENCODE peaks in C&T:
ENCODE_H3K27ac_peaks_in_CT_set = c()
ENCODE_H3K27me3_peaks_in_CT_set = c()
for(i in 1:length(peakList)){
in_CT_H3K27ac = IRanges::subsetByOverlaps(ENCODE_H3K27ac, peakList[[i]])
in_CT_H3K27me3 = IRanges::subsetByOverlaps(ENCODE_H3K27me3, peakList[[i]])
ENCODE_H3K27ac_peaks_in_CT_set = c(ENCODE_H3K27ac_peaks_in_CT_set, in_CT_H3K27ac)
ENCODE_H3K27me3_peaks_in_CT_set = c(ENCODE_H3K27me3_peaks_in_CT_set, in_CT_H3K27me3)
}
ENCODE_H3K27ac_peaks_in_CT_set = setNames(ENCODE_H3K27ac_peaks_in_CT_set, sampleList_all)
ENCODE_H3K27me3_peaks_in_CT_set = setNames(ENCODE_H3K27me3_peaks_in_CT_set, sampleList_all)Collect ENCODE peaks not in C&T:
ENCODE_H3K27ac_peaks_not_in_CT_set = c()
ENCODE_H3K27me3_peaks_not_in_CT_set = c()
for(i in 1:length(peakList)){
not_in_CT_H3K27ac = IRanges::subsetByOverlaps(ENCODE_H3K27ac, peakList[[i]], invert = TRUE)
not_in_CT_H3K27me3 = IRanges::subsetByOverlaps(ENCODE_H3K27me3, peakList[[i]], invert = TRUE)
ENCODE_H3K27ac_peaks_not_in_CT_set = c(ENCODE_H3K27ac_peaks_not_in_CT_set, not_in_CT_H3K27ac)
ENCODE_H3K27me3_peaks_not_in_CT_set = c(ENCODE_H3K27me3_peaks_not_in_CT_set, not_in_CT_H3K27me3)
}
ENCODE_H3K27ac_peaks_not_in_CT_set = setNames(ENCODE_H3K27ac_peaks_not_in_CT_set, sampleList_all)
ENCODE_H3K27me3_peaks_not_in_CT_set = setNames(ENCODE_H3K27me3_peaks_not_in_CT_set, sampleList_all)ENCODE_overlaps = Total_captured_ENCODE_peaks %>% left_join(Total_CT_peaks, by = "Sample") %>% left_join(CT_in_ENCODE, by = "Sample")
ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27ac = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27ac / ENCODE_overlaps$Total_PeakN * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27ac = ENCODE_overlaps$ENCODE_H3K27ac_captured_peaks / length(ENCODE_H3K27ac) * 100
ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27me3 = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27me3 / ENCODE_overlaps$Total_PeakN * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27me3 = ENCODE_overlaps$ENCODE_H3K27me3_captured_peaks / length(ENCODE_H3K27me3) * 100
ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27ac_withDup = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27ac_withDup / ENCODE_overlaps$Total_PeakN_withDup * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27ac_withDup = ENCODE_overlaps$ENCODE_H3K27ac_captured_peaks_withDup / length(ENCODE_H3K27ac) * 100
ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27me3_withDup = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27me3_withDup / ENCODE_overlaps$Total_PeakN_withDup * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27me3_withDup = ENCODE_overlaps$ENCODE_H3K27me3_captured_peaks_withDup / length(ENCODE_H3K27me3) * 100
ENCODE_overlapsENCODE_overlaps_H3K27ac = ENCODE_overlaps[c("Sample", "Percentage_of_CT_in_ENCODE_H3K27ac", "Percentage_of_total_ENCODE_H3K27ac")]
Percentage_of_CT_in_ENCODE_H3K27ac_SEACR = ENCODE_overlaps_H3K27ac[c(seq(1, nrow(ENCODE_overlaps_H3K27ac), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27ac_MACS2 = ENCODE_overlaps_H3K27ac[c(seq(2, nrow(ENCODE_overlaps_H3K27ac), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27ac_summary = data.frame(Sample = sampleList, Percentage_of_CT_in_ENCODE_H3K27ac_SEACR = Percentage_of_CT_in_ENCODE_H3K27ac_SEACR, Percentage_of_CT_in_ENCODE_H3K27ac_MACS2 = Percentage_of_CT_in_ENCODE_H3K27ac_MACS2)
Percentage_of_total_ENCODE_H3K27ac_SEACR = ENCODE_overlaps_H3K27ac[c(seq(1, nrow(ENCODE_overlaps_H3K27ac), by = 2)),3]
Percentage_of_total_ENCODE_H3K27ac_MACS2 = ENCODE_overlaps_H3K27ac[c(seq(2, nrow(ENCODE_overlaps_H3K27ac), by = 2)),3]
Percentage_of_total_ENCODE_H3K27ac_summary = data.frame(Sample = sampleList, Percentage_of_total_ENCODE_H3K27ac_SEACR = Percentage_of_total_ENCODE_H3K27ac_SEACR, Percentage_of_total_ENCODE_H3K27ac_MACS2 = Percentage_of_total_ENCODE_H3K27ac_MACS2)
ENCODE_overlaps_H3K27me3 = ENCODE_overlaps[c("Sample", "Percentage_of_CT_in_ENCODE_H3K27me3", "Percentage_of_total_ENCODE_H3K27me3")]
Percentage_of_CT_in_ENCODE_H3K27me3_SEACR = ENCODE_overlaps_H3K27me3[c(seq(1, nrow(ENCODE_overlaps_H3K27me3), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27me3_MACS2 = ENCODE_overlaps_H3K27me3[c(seq(2, nrow(ENCODE_overlaps_H3K27me3), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27me3_summary = data.frame(Sample = sampleList, Percentage_of_CT_in_ENCODE_H3K27me3_SEACR = Percentage_of_CT_in_ENCODE_H3K27me3_SEACR, Percentage_of_CT_in_ENCODE_H3K27me3_MACS2 = Percentage_of_CT_in_ENCODE_H3K27me3_MACS2)
Percentage_of_total_ENCODE_H3K27me3_SEACR = ENCODE_overlaps_H3K27me3[c(seq(1, nrow(ENCODE_overlaps_H3K27me3), by = 2)),3]
Percentage_of_total_ENCODE_H3K27me3_MACS2 = ENCODE_overlaps_H3K27me3[c(seq(2, nrow(ENCODE_overlaps_H3K27me3), by = 2)),3]
Percentage_of_total_ENCODE_H3K27me3_summary = data.frame(Sample = sampleList, Percentage_of_total_ENCODE_H3K27me3_SEACR = Percentage_of_total_ENCODE_H3K27me3_SEACR, Percentage_of_total_ENCODE_H3K27me3_MACS2 = Percentage_of_total_ENCODE_H3K27me3_MACS2)
ENCODE_overlaps_H3K27ac_withDup = ENCODE_overlaps[c("Sample", "Percentage_of_CT_in_ENCODE_H3K27ac_withDup", "Percentage_of_total_ENCODE_H3K27ac_withDup")]
Percentage_of_CT_in_ENCODE_H3K27ac_withDup_SEACR = ENCODE_overlaps_H3K27ac_withDup[c(seq(1, nrow(ENCODE_overlaps_H3K27ac_withDup), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27ac_withDup_MACS2 = ENCODE_overlaps_H3K27ac_withDup[c(seq(2, nrow(ENCODE_overlaps_H3K27ac_withDup), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27ac_withDup_summary = data.frame(Sample = sampleList, Percentage_of_CT_in_ENCODE_H3K27ac_withDup_SEACR = Percentage_of_CT_in_ENCODE_H3K27ac_withDup_SEACR, Percentage_of_CT_in_ENCODE_H3K27ac_withDup_MACS2 = Percentage_of_CT_in_ENCODE_H3K27ac_withDup_MACS2)
Percentage_of_total_ENCODE_H3K27ac_withDup_SEACR = ENCODE_overlaps_H3K27ac_withDup[c(seq(1, nrow(ENCODE_overlaps_H3K27ac_withDup), by = 2)),3]
Percentage_of_total_ENCODE_H3K27ac_withDup_MACS2 = ENCODE_overlaps_H3K27ac_withDup[c(seq(2, nrow(ENCODE_overlaps_H3K27ac_withDup), by = 2)),3]
Percentage_of_total_ENCODE_H3K27ac_withDup_summary = data.frame(Sample = sampleList, Percentage_of_total_ENCODE_H3K27ac_withDup_SEACR = Percentage_of_total_ENCODE_H3K27ac_withDup_SEACR, Percentage_of_total_ENCODE_H3K27ac_withDup_MACS2 = Percentage_of_total_ENCODE_H3K27ac_withDup_MACS2)
ENCODE_overlaps_H3K27me3_withDup = ENCODE_overlaps[c("Sample", "Percentage_of_CT_in_ENCODE_H3K27me3_withDup", "Percentage_of_total_ENCODE_H3K27me3_withDup")]
Percentage_of_CT_in_ENCODE_H3K27me3_withDup_SEACR = ENCODE_overlaps_H3K27me3_withDup[c(seq(1, nrow(ENCODE_overlaps_H3K27me3_withDup), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27me3_withDup_MACS2 = ENCODE_overlaps_H3K27me3_withDup[c(seq(2, nrow(ENCODE_overlaps_H3K27me3_withDup), by = 2)),2]
Percentage_of_CT_in_ENCODE_H3K27me3_withDup_summary = data.frame(Sample = sampleList, Percentage_of_CT_in_ENCODE_H3K27me3_withDup_SEACR = Percentage_of_CT_in_ENCODE_H3K27me3_withDup_SEACR, Percentage_of_CT_in_ENCODE_H3K27me3_withDup_MACS2 = Percentage_of_CT_in_ENCODE_H3K27me3_withDup_MACS2)
Percentage_of_total_ENCODE_H3K27me3_withDup_SEACR = ENCODE_overlaps_H3K27me3_withDup[c(seq(1, nrow(ENCODE_overlaps_H3K27me3_withDup), by = 2)),3]
Percentage_of_total_ENCODE_H3K27me3_withDup_MACS2 = ENCODE_overlaps_H3K27me3_withDup[c(seq(2, nrow(ENCODE_overlaps_H3K27me3_withDup), by = 2)),3]
Percentage_of_total_ENCODE_H3K27me3_withDup_summary = data.frame(Sample = sampleList, Percentage_of_total_ENCODE_H3K27me3_withDup_SEACR = Percentage_of_total_ENCODE_H3K27me3_withDup_SEACR, Percentage_of_total_ENCODE_H3K27me3_withDup_MACS2 = Percentage_of_total_ENCODE_H3K27me3_withDup_MACS2)Precision: % of CUT&Tag peaks in ENCODE Recall: % of ENCODE peaks captured
f_measure = data.frame(Sample = sampleList_all)
f_measure$true_pos = lapply(ENCODE_H3K27ac_peaks_in_CT_set, length) %>% unname() %>% unlist()
f_measure$false_pos = lapply(CT_peaks_not_in_ENCODE_H3K27ac_set, length) %>% unname() %>% unlist()
f_measure$false_neg = lapply(ENCODE_H3K27ac_peaks_not_in_CT_set, length) %>% unname() %>% unlist()
f_measure$F1 = f_measure$true_pos/(f_measure$true_pos + 0.5*(f_measure$false_pos + f_measure$false_neg))f_measure_F1_SEACR_list = f_measure[c(seq(1, nrow(f_measure), by = 2)), 5]
f_measure_F1_MACS2_list = f_measure[c(seq(2, nrow(f_measure), by = 2)), 5]
f_measure_F1 = data.frame(Sample = sampleList, SEACR_F1 = f_measure_F1_SEACR_list, MACS2_F1 = f_measure_F1_MACS2_list)
f_measure_F1_melt = melt(f_measure_F1, id.vars = "Sample")
F1_plot = ggplot(f_measure_F1_melt, aes(Sample, value)) +
geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
theme_bw(base_size = 15) +
ylab("F1 score") +
xlab("") +
ggtitle("F-measures of precision and recall (ENCODE H3K27ac)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "bottom") +
scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2"))
F1_plotPercentage_of_CT_in_ENCODE_H3K27ac_summary_melt = melt(Percentage_of_CT_in_ENCODE_H3K27ac_summary, id.vars = "Sample")
Peaks_in_ENCODE_plot = ggplot(Percentage_of_CT_in_ENCODE_H3K27ac_summary_melt, aes(Sample, value)) +
theme_bw() +
geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
ylim(0, 100) +
ylab("% of peaks in ENCODE") +
xlab("") +
ggtitle("Proportion of CUT&Tag peaks falling into ENCODE H3K27ac") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(axis.text = element_text(size = 15)) +
theme(legend.position = "bottom") +
scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2")) +
theme(plot.margin = unit(c(1,1,1,1), "cm"))
Peaks_in_ENCODE_plotPercentage_of_CT_in_ENCODE_H3K27ac_withDup_summary_melt = melt(Percentage_of_CT_in_ENCODE_H3K27ac_withDup_summary, id.vars = "Sample")
Peaks_in_ENCODE_plot_withDup = ggplot(Percentage_of_CT_in_ENCODE_H3K27ac_withDup_summary_melt, aes(Sample, value)) +
theme_bw() +
geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
ylim(0, 100) +
ylab("% of peaks in ENCODE") +
xlab("") +
ggtitle("Proportion of CUT&Tag peaks falling into ENCODE H3K27ac (with duplicates)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "bottom") +
scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2")) +
theme(plot.margin = unit(c(1,1,1,1), "cm"))
Peaks_in_ENCODE_plot_withDupmaster_qvals_df = c()
for(i in 1:length(peakList)){
df_captured = data.frame("Sample" = sampleList_all[i], "-log10(qval)" = ENCODE_H3K27ac_peaks_in_CT_set[[i]]$V9, "Captured" = "Captured", check.names = FALSE)
df_missed = data.frame("Sample" = sampleList_all[i], "-log10(qval)" = ENCODE_H3K27ac_peaks_not_in_CT_set[[i]]$V9, "Captured" = "Missed", check.names = FALSE)
master_qvals_df = rbind(master_qvals_df, rbind(df_captured, df_missed))
}
master_qvals_df_melt = melt(master_qvals_df, id.vars = c("Sample", "Captured"))ggplot(master_qvals_df_melt, aes(x = value, y = Sample, fill = Captured)) +
theme_bw() +
stat_boxplot(geom = "errorbar") +
geom_boxplot(outlier.shape = NA) +
coord_cartesian(xlim = c(1, 250)) +
ylab("") +
xlab("-log10(q-val)") +
labs(fill = "ENCODE H3K27ac peaks") +
theme(legend.position = "bottom") +
theme(text = element_text(size = 25))tests = list()
for(i in 1:length(sampleList_all)){
captured = master_qvals_df[(master_qvals_df$Sample == sampleList_all[i] & master_qvals_df$Captured == "Captured"),]
missed = master_qvals_df[(master_qvals_df$Sample == sampleList_all[i] & master_qvals_df$Captured == "Missed"),]
var_test = var.test(captured$`-log10(qval)`, missed$`-log10(qval)`)
var = ifelse(var_test$p.value < 0.05, FALSE, TRUE)
test = t.test(captured$`-log10(qval)`, missed$`-log10(qval)`, var.equal = var, paired = FALSE, alternative = c("two.sided"))
#tests[i] = ifelse(test$p.value < 0.05, "significant", "insignificant")
tests[[i]] = test
}
names(tests) = sampleList_all
tests## $Diagenode_C15410196_1_50_SEACR
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 83.354, df = 21985, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 47.59428 49.88656
## sample estimates:
## mean of x mean of y
## 65.92364 17.18322
##
##
## $Diagenode_C15410196_1_50_MACS2
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 93.556, df = 18945, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 56.95664 59.39429
## sample estimates:
## mean of x mean of y
## 73.89533 15.71987
##
##
## $CST9733_1_100_H3K27me3_SEACR
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 6.0819, df = 942.89, p-value = 1.724e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 11.36836 22.20002
## sample estimates:
## mean of x mean of y
## 51.93975 35.15557
##
##
## $CST9733_1_100_H3K27me3_MACS2
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 10.153, df = 1374.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 18.73792 27.71304
## sample estimates:
## mean of x mean of y
## 58.07635 34.85088
Percentage_of_CT_in_ENCODE_H3K27me3_summary_melt = melt(Percentage_of_CT_in_ENCODE_H3K27me3_summary, id.vars = "Sample")
Peaks_in_ENCODE_H3K27me3_plot = ggplot(Percentage_of_CT_in_ENCODE_H3K27me3_summary_melt, aes(Sample, value)) +
theme_bw() +
geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
ylim(0, 100) +
ylab("% of peaks in ENCODE") +
xlab("") +
ggtitle("Proportion of CUT&Tag peaks falling into ENCODE H3K27me3") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "bottom") +
scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2")) +
theme(plot.margin = unit(c(1,1,1,1), "cm"))
Peaks_in_ENCODE_H3K27me3_plotPercentage_of_CT_in_ENCODE_H3K27me3_withDup_summary_melt = melt(Percentage_of_CT_in_ENCODE_H3K27me3_withDup_summary, id.vars = "Sample")
Peaks_in_ENCODE_H3K27me3_plot_withDup = ggplot(Percentage_of_CT_in_ENCODE_H3K27me3_withDup_summary_melt, aes(Sample, value)) +
theme_bw() +
geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
ylim(0, 100) +
ylab("% of peaks in ENCODE") +
xlab("") +
ggtitle("Proportion of CUT&Tag peaks falling into ENCODE H3K27me3 (with duplicates)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(axis.text = element_text(size = 15)) +
theme(legend.position = "bottom") +
scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2")) +
theme(plot.margin = unit(c(1,1,1,1), "cm"))
Peaks_in_ENCODE_H3K27me3_plot_withDupPercentage_of_total_ENCODE_H3K27ac_summary_melt = melt(Percentage_of_total_ENCODE_H3K27ac_summary, id.vars = "Sample")
ENCODE_in_CT_plot = ggplot(Percentage_of_total_ENCODE_H3K27ac_summary_melt, aes(Sample, value)) +
theme_bw(base_size = 15) +
geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
ylim(0, 100) +
ylab("% of ENCODE peaks captured") +
xlab("Sample") +
ggtitle("Proportion of ENCODE H3K27ac peaks captured by CUT&Tag") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "bottom") +
scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2"))
ENCODE_in_CT_plotPercentage_of_total_ENCODE_H3K27ac_withDup_summary_melt = melt(Percentage_of_total_ENCODE_H3K27ac_withDup_summary, id.vars = "Sample")
ENCODE_in_CT_plot_withDup = ggplot(Percentage_of_total_ENCODE_H3K27ac_withDup_summary_melt, aes(Sample, value)) +
theme_bw(base_size = 15) +
geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
ylim(0, 100) +
ylab("% of ENCODE peaks captured") +
xlab("Sample") +
ggtitle("Proportion of ENCODE H3K27ac peaks captured by CUT&Tag (with duplicates)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "bottom") +
scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2"))
ENCODE_in_CT_plot_withDupPercentage_of_total_ENCODE_H3K27me3_summary_melt = melt(Percentage_of_total_ENCODE_H3K27me3_summary, id.vars = "Sample")
ENCODE_in_CT_H3K27me3_plot = ggplot(Percentage_of_total_ENCODE_H3K27me3_summary_melt, aes(Sample, value)) +
theme_bw(base_size = 15) +
geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
ylim(0, 100) +
ylab("% of ENCODE peaks captured") +
xlab("") +
ggtitle("Proportion of ENCODE H3K27me3 peaks captured by CUT&Tag") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "bottom") +
scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2"))
ENCODE_in_CT_H3K27me3_plotPercentage_of_total_ENCODE_H3K27me3_withDup_summary_melt = melt(Percentage_of_total_ENCODE_H3K27me3_withDup_summary, id.vars = "Sample")
ENCODE_in_CT_H3K27me3_plot_withDup = ggplot(Percentage_of_total_ENCODE_H3K27me3_withDup_summary_melt, aes(Sample, value)) +
theme_bw(base_size = 15) +
geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
ylim(0, 100) +
ylab("% of ENCODE peaks captured") +
xlab("") +
ggtitle("Proportion of ENCODE H3K27me3 peaks captured by CUT&Tag (with duplicates)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "bottom") +
scale_fill_discrete(name = "Peak caller", labels = c("SEACR", "MACS2"))
ENCODE_in_CT_H3K27me3_plot_withDupAll samples scaled to 2 million reads
Heatmap_hg19
All samples scaled to 2 million total reads
chrHMM_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHmm/wgEncodeBroadHmmK562HMM.bed.gz"
#chrHMM_url = "https://www.encodeproject.org/files/ENCFF163QUM/@@download/ENCFF163QUM.bed.gz"
chrHMM = genomation::readBed(chrHMM_url)## Rows: 2 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X4, X6
## dbl (5): X2, X3, X5, X7, X8
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
chrHMM_list = GenomicRanges::split(chrHMM, chrHMM$name, drop = TRUE)
chrHMM_names = c("Transcription elongation [10]", "Weak transcription [11]", "Repressed [12]", "Heterochromatin [13]", "Repetitive [14]", "Repetitive [15]", "Active promoter [1]", "Weak promoter [2]", "Poised promoter [3]", "Strong enhancer [4]", "Strong enhancer [5]", "Weak enhancer [6]", "Weak enhancer [7]", "Insulator [8]", "Transcription transition [9]")
names(chrHMM_list) = chrHMM_namesannotation = genomation::annotateWithFeatures(GenomicRanges::GRangesList(peakList_with_ENCODE), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation, plot = FALSE, precedence = FALSE)
rownames(matrix) = sampleList_all_with_ENCODE
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")
ggplot(matrix_melt) +
theme_minimal() +
geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
ylab("") +
scale_fill_viridis(option = "plasma") +
ggpubr::rotate_x_text(angle = 45) +
theme(text = element_text(size = 15)) +
theme(legend.key.size = unit(1, "cm")) Obtain duplicate-only peaks:
duplicate_only_peaks = c()
for(i in 1:length(peakList)){
dup_only = IRanges::subsetByOverlaps(peakList_withDup[[i]], peakList[[i]], invert = TRUE)
duplicate_only_peaks = c(duplicate_only_peaks, dup_only)
}
names(duplicate_only_peaks) = names(peakList)annotation_dup_only = genomation::annotateWithFeatures(GenomicRanges::GRangesList(duplicate_only_peaks), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation_dup_only, plot = FALSE)
rownames(matrix) = sampleList_all
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")
ggplot(matrix_melt) +
theme_minimal() +
geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
ylab("") +
scale_fill_viridis(option = "plasma") +
ggpubr::rotate_x_text(angle = 45) +
theme(text = element_text(size = 15)) +
theme(legend.key.size = unit(1, "cm")) annotation_in_ENCODE_H3K27ac = genomation::annotateWithFeatures(GenomicRanges::GRangesList(CT_peaks_in_ENCODE_H3K27ac_set), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation_in_ENCODE_H3K27ac, plot = FALSE)
rownames(matrix) = sampleList_all
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")
ggplot(matrix_melt) +
theme_minimal() +
geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
ylab("") +
scale_fill_viridis(option = "plasma") +
ggpubr::rotate_x_text(angle = 45) +
theme(text = element_text(size = 15)) +
theme(legend.key.size = unit(1, "cm")) annotation_not_in_ENCODE_H3K27ac = genomation::annotateWithFeatures(GenomicRanges::GRangesList(CT_peaks_not_in_ENCODE_H3K27ac_set), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation_not_in_ENCODE_H3K27ac, plot = FALSE)
rownames(matrix) = sampleList_all
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")
ggplot(matrix_melt) +
theme_minimal() +
geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
ylab("") +
scale_fill_viridis(option = "plasma") +
ggpubr::rotate_x_text(angle = 45) +
theme(text = element_text(size = 15)) +
theme(legend.key.size = unit(1, "cm")) annotation_in_CT_H3K27ac = genomation::annotateWithFeatures(GenomicRanges::GRangesList(ENCODE_H3K27ac_peaks_in_CT_set), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation_in_CT_H3K27ac, plot = FALSE)
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")
ggplot(matrix_melt) +
theme_minimal() +
geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
ylab("") +
scale_fill_viridis(option = "plasma") +
ggpubr::rotate_x_text(angle = 45) +
theme(text = element_text(size = 15)) +
theme(legend.key.size = unit(1, "cm")) annotation_not_in_CT_H3K27ac = genomation::annotateWithFeatures(GenomicRanges::GRangesList(ENCODE_H3K27ac_peaks_not_in_CT_set), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation_not_in_CT_H3K27ac, plot = FALSE)
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")
ggplot(matrix_melt) +
theme_minimal() +
geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
ylab("") +
scale_fill_viridis(option = "plasma") +
ggpubr::rotate_x_text(angle = 45) +
theme(text = element_text(size = 15)) +
theme(legend.key.size = unit(1, "cm")) # use original peaks (unfiltered)
SEACR_peakList = peakList_unfilt[seq(1, length(peakList), by = 2)]
MACS2_peakList = peakList_unfilt[seq(2, length(peakList), by = 2)]SEACR_only_peaks = list()
MACS2_only_peaks = list()
for(i in 1:length(SEACR_peakList)){
SEACR_only_peaks[[i]] = IRanges::subsetByOverlaps(SEACR_peakList[[i]], MACS2_peakList[[i]], invert = TRUE)
MACS2_only_peaks[[i]] = IRanges::subsetByOverlaps(MACS2_peakList[[i]], SEACR_peakList[[i]], invert = TRUE)
}
names(SEACR_only_peaks) = sampleList
names(MACS2_only_peaks) = sampleListSEACR_only_peaks_total = c()
SEACR_only_peaks_in_ENCODE = c()
MACS2_only_peaks_total = c()
MACS2_only_peaks_in_ENCODE = c()
for(i in 1:length(SEACR_only_peaks)){
SEACR_only_peaks_in_ENCODE[i] = IRanges::subsetByOverlaps(SEACR_only_peaks[[i]], ENCODE_H3K27ac) %>% length()
SEACR_only_peaks_total[i] = length(SEACR_only_peaks[[i]])
MACS2_only_peaks_in_ENCODE[i] = IRanges::subsetByOverlaps(MACS2_only_peaks[[i]], ENCODE_H3K27ac) %>% length()
MACS2_only_peaks_total[i] = length(MACS2_only_peaks[[i]])
}
caller_specific_peaks = data.frame(Sample = names(SEACR_peakList), SEACR_only_peaks_in_ENCODE = SEACR_only_peaks_in_ENCODE, SEACR_only_peaks_total = SEACR_only_peaks_total, MACS2_only_peaks_in_ENCODE = MACS2_only_peaks_in_ENCODE, MACS2_only_peaks_total = MACS2_only_peaks_total)
caller_specific_peaks$SEACR_only_peaks_in_ENCODE_percentage = caller_specific_peaks$SEACR_only_peaks_in_ENCODE / caller_specific_peaks$SEACR_only_peaks_total * 100
caller_specific_peaks$MACS2_only_peaks_in_ENCODE_percentage = caller_specific_peaks$MACS2_only_peaks_in_ENCODE / caller_specific_peaks$MACS2_only_peaks_total * 100
caller_specific_peaksannotation = genomation::annotateWithFeatures(GenomicRanges::GRangesList(SEACR_only_peaks), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation, plot = FALSE)
rownames(matrix) = names(SEACR_only_peaks)
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")
ggplot(matrix_melt) +
theme_minimal() +
geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
ylab("") +
scale_fill_viridis(option = "plasma") +
ggpubr::rotate_x_text(angle = 45) +
theme(text = element_text(size = 25)) +
theme(legend.key.size = unit(1, "cm")) annotation = genomation::annotateWithFeatures(GenomicRanges::GRangesList(MACS2_only_peaks), chrHMM_list)
matrix = genomation::heatTargetAnnotation(annotation, plot = FALSE)
rownames(matrix) = names(MACS2_only_peaks)
colnames(matrix) = chrHMM_names
matrix_melt = melt(matrix)
colnames(matrix_melt) = c("Sample", "State", "% of peaks")
ggplot(matrix_melt) +
theme_minimal() +
geom_tile(aes(x = State, y = Sample, fill = `% of peaks`)) +
ylab("") +
scale_fill_viridis(option = "plasma") +
ggpubr::rotate_x_text(angle = 45) +
theme(text = element_text(size = 25)) +
theme(legend.key.size = unit(1, "cm")) peakAnnoList = lapply(peakList_with_ENCODE, annotatePeak, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene, tssRegion = c(-3000, 3000), verbose = FALSE)
annobar = ChIPseeker::plotAnnoBar(peakAnnoList)
annobarpeakAnnoList = lapply(peakList_with_ENCODE, annotatePeak, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene, tssRegion = c(-3000, 3000), verbose = FALSE)
genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
names(genes) = sub("_", "\n", names(genes))
compKEGG = clusterProfiler::compareCluster(geneCluster = genes,
fun = "enrichKEGG",
pvalueCutoff = 0.05,
pAdjustMethod = "BH")
KEGG_plot = clusterProfiler::dotplot(compKEGG, showCategory = 10, title = "KEGG Pathway Enrichment Analysis") +
theme_bw(base_size = 13) +
ggpubr::rotate_x_text(angle = 45) +
scale_size_area(max_size = 10)
KEGG_plotpeakAnnoList = lapply(peakList_with_ENCODE, annotatePeak, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene, tssRegion = c(-3000, 3000), verbose = FALSE)
genes = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
names(genes) = sub("_", "\n", names(genes))
compGO = clusterProfiler::compareCluster(geneCluster = genes,
OrgDb = "org.Hs.eg.db",
fun = "enrichGO",
pvalueCutoff = 0.05,
pAdjustMethod = "BH")
GO_plot = clusterProfiler::dotplot(compGO, showCategory = 10, title = "GO Enrichment Analysis") +
theme_bw(base_size = 13) +
ggpubr::rotate_x_text(angle = 45) +
scale_size_area(max_size = 10)
GO_plotpathways = list()
for(peak in peakList_with_ENCODE){
gene = ChIPseeker::seq2gene(peak, tssRegion = c(-3000, 3000), flankDistance = 3000, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene)
pathway = ReactomePA::enrichPathway(gene)
pathways = c(pathways, pathway)
}
names(pathways) = names(peakList_with_ENCODE)pathways = list()
geneList = list()
for(i in 1:length(peakList_with_ENCODE)){
gene = ChIPseeker::seq2gene(peakList_with_ENCODE[[i]], tssRegion = c(-3000, 3000), flankDistance = 3000, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene)
geneList[[i]] = gene
pathway = ReactomePA::enrichPathway(gene)
pathways[[i]] = pathway
}
names(pathways) = names(peakList_with_ENCODE)
names(geneList) = names(peakList_with_ENCODE)ReactomePA_comp = clusterProfiler::compareCluster(geneList, fun = "enrichPathway")
clusterProfiler::dotplot(ReactomePA_comp, showCategory = 10) +
theme_bw(base_size = 13) +
ggtitle("Reactome Pathway Analysis") +
ggpubr::rotate_x_text(angle = 45) +
scale_size_area(max_size = 10)motifs = list()
for(i in 1:length(sampleList_all_with_ENCODE)){
motif = marge::read_known_results(path = paste0("/Volumes/test_data/motifs/", sampleList_all_with_ENCODE[i]), homer_dir = TRUE)
motifs[[i]] = motif[order(-motif$log_p_value), ]
}
motifs = setNames(motifs, sampleList_all_with_ENCODE)
#in some cases top -log(p) values are infinite (especially for ENCODE H3K27ac), so will set them to the maximum -log(p-value) attained by other samples
for(i in 1:length(sampleList_all_with_ENCODE)){
motifs[[i]]$log_p_value[(motifs[[i]]$log_p_value == "Inf")] = 300
}ATAC-seq libraries: ENCLB918NXF, ENCLB758GEG from https://www.encodeproject.org/experiments/ENCSR483RKN/
# did not evaluate
bamFile = "/Volumes/test_data/resources/ATAC/bam/K562_ATACseq_R1.mLb.clN.sorted.bam"
total_ATAC_reads = 49677798
ATAC_in_captured_ENCODE_inPeakData = c()
for(i in 1:length(sampleList_all)){
total_bases = sum(width(ENCODE_H3K27ac_peaks_in_CT_set[[i]]))
peak = data.frame(ENCODE_H3K27ac_peaks_in_CT_set[[i]])
peak.gr = GenomicRanges::GRanges(seqnames = peak$seqnames, IRanges(start = peak$start, end = peak$end), strand = "*")
fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = FALSE, by_rg = FALSE, format = "bam")
ATAC_in_captured_ENCODE_inPeakN = counts(fragment_counts)[,1] %>% sum
ATAC_in_captured_ENCODE_inPeakData = rbind(ATAC_in_captured_ENCODE_inPeakData, data.frame(Sample = sampleList_all[i], ATAC_reads_in_captured_peaks = ATAC_in_captured_ENCODE_inPeakN, Total_bases = total_bases))
}
# adjust FRiPs for total coverage of ENCODE peak set and multiplied by constant
ATAC_in_captured_ENCODE_inPeakData$measure_ATAC_in_captured_ENCODE = ATAC_in_captured_ENCODE_inPeakData$ATAC_reads_in_captured_peaks/total_ATAC_reads * (10^9/ATAC_in_captured_ENCODE_inPeakData$Total_bases)
ATAC_in_captured_ENCODE_inPeakData# did not evaluate
ATAC_in_missed_ENCODE_inPeakData = c()
for(i in 1:length(sampleList_all)){
total_bases = sum(width(ENCODE_H3K27ac_peaks_not_in_CT_set[[i]]))
peak = data.frame(ENCODE_H3K27ac_peaks_not_in_CT_set[[i]])
peak.gr = GenomicRanges::GRanges(seqnames = peak$seqnames, IRanges(start = peak$start, end = peak$end), strand = "*")
fragment_counts = chromVAR::getCounts(bamFile, peak.gr, paired = FALSE, by_rg = FALSE, format = "bam")
ATAC_in_missed_ENCODE_inPeakN = counts(fragment_counts)[,1] %>% sum
ATAC_in_missed_ENCODE_inPeakData = rbind(ATAC_in_missed_ENCODE_inPeakData, data.frame(Sample = sampleList_all[i], ATAC_reads_in_missed_peaks = ATAC_in_missed_ENCODE_inPeakN, Total_bases = total_bases))
}
# adjusted FRiPs for total coverage of ENCODE peak set and multiplied by constant
ATAC_in_missed_ENCODE_inPeakData$measure_ATAC_in_missed_ENCODE = ATAC_in_missed_ENCODE_inPeakData$ATAC_reads_in_missed_peaks/total_ATAC_reads * (10^9/ATAC_in_missed_ENCODE_inPeakData$Total_bases)
ATAC_in_missed_ENCODE_inPeakData# did not evaluate
ATAC_reads_in_ENCODE_summary = data.frame(Sample = labels_SEACR_MACS2, ATAC_in_captured_ENCODE = ATAC_in_captured_ENCODE_inPeakData$measure_ATAC_in_captured_ENCODE, ATAC_in_missed_ENCODE = ATAC_in_missed_ENCODE_inPeakData$measure_ATAC_in_missed_ENCODE)
ATAC_reads_in_ENCODE_summary_melt = melt(ATAC_reads_in_ENCODE_summary, id.vars = "Sample")
ATAC_reads_in_ENCODE_plot = ggplot(ATAC_reads_in_ENCODE_summary_melt, aes(Sample, value)) +
geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
theme_bw() +
ylab("ATAC reads in ENCODE (adj %)") +
xlab("") +
ggtitle("ATAC reads in ENCODE H3K27ac peaks captured vs missed by CUT&Tag") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "bottom") +
scale_fill_discrete(name = "", labels = c("Reads in captured ENCODE peaks", "Reads in missed ENCODE peaks"))
ATAC_reads_in_ENCODE_plot#STARRseq_rep1 = ChIPseeker::readPeakFile(paste0(dir, "/STARRseq_peaks_hg38_ENCFF717VJK.bed"), as = "GRanges")
#STARRseq_rep2 = ChIPseeker::readPeakFile(paste0(dir, "/STARRseq_peaks_hg38_ENCFF394DBM.bed"), as = "GRanges")
#STARRseq_hg38 = IRanges::subsetByOverlaps(STARRseq_rep1, STARRseq_rep2)
chainfile = rtracklayer::import.chain(system.file(package = "liftOver", "extdata", "hg38ToHg19.over.chain"))
#seqlevelsStyle(STARRseq_hg38) = "UCSC"
#STARRseq = rtracklayer::liftOver(STARRseq_hg38, chainfile) %>% unlist()
#write.table(data.frame(STARRseq), file = paste0(datadir, "/resources/STARRseq/STARRseq_hg19_replicated_peaks_ENCSR858MPS.bed"), sep = "\t", col.names = FALSE, row.names = FALSE)
STARRseq = ChIPseeker::readPeakFile(paste0(datadir, "/resources/STARRseq/STARRseq_hg19_replicated_peaks_ENCSR858MPS.bed"), as = "GRanges")
dir = "/Users/leylaabbasova/Documents/CUTnTag/analysis/STARRseq"
ENCODE_DNase = ChIPseeker::readPeakFile(paste0(dir, "/DNase_seq_hg38_ENCFF722NSO.bed"), as = "GRanges")
seqlevelsStyle(ENCODE_DNase) = "UCSC"
ENCODE_DNase = rtracklayer::liftOver(ENCODE_DNase, chainfile) %>% unlist()STARRseq_filt = IRanges::subsetByOverlaps(STARRseq, ENCODE_DNase)STARRseq_overlapping_peaks = c()
STARRseq_overlap_prop = c()
STARRseq_missed_peaks = c()
STARRseq_filt_overlapping_peaks = c()
STARRseq_filt_overlap_prop = c()
STARRseq_filt_missed_peaks = c()
for(i in 1:length(peakList_with_ENCODE)){
STARRseq_overlapping_peaks[[i]] = IRanges::subsetByOverlaps(STARRseq, peakList_with_ENCODE[[i]])
STARRseq_missed_peaks[[i]] = IRanges::subsetByOverlaps(STARRseq, peakList_with_ENCODE[[i]], invert = TRUE)
STARRseq_overlap_prop = c(STARRseq_overlap_prop, length(STARRseq_overlapping_peaks[[i]])/length(STARRseq))
STARRseq_filt_overlapping_peaks[[i]] = IRanges::subsetByOverlaps(STARRseq_filt, peakList_with_ENCODE[[i]])
STARRseq_filt_missed_peaks[[i]] = IRanges::subsetByOverlaps(STARRseq_filt, peakList_with_ENCODE[[i]], invert = TRUE)
STARRseq_filt_overlap_prop = c(STARRseq_filt_overlap_prop, length(STARRseq_filt_overlapping_peaks[[i]])/length(STARRseq_filt))
}
names(STARRseq_overlapping_peaks) = sampleList_all_with_ENCODE
names(STARRseq_overlap_prop) = sampleList_all_with_ENCODE
names(STARRseq_missed_peaks) = sampleList_all_with_ENCODE
names(STARRseq_filt_overlapping_peaks) = sampleList_all_with_ENCODE
names(STARRseq_filt_overlap_prop) = sampleList_all_with_ENCODE
names(STARRseq_filt_missed_peaks) = sampleList_all_with_ENCODE
STARRseq_peak_overlaps_df = data.frame("Sample" = sampleList_all_with_ENCODE,
"% captured STARR-seq peaks (all)" = (STARRseq_overlap_prop * 100),
"% captured STARR-seq peaks (filt)" = (STARRseq_filt_overlap_prop * 100),
"Total sample peaks" = lapply(peakList_with_ENCODE, length) %>% unlist %>% unname,
check.names = FALSE, row.names = NULL)
STARRseq_peak_overlaps_dfCheck -log10(qval) values of captured and missed STARR-seq peaks:
STARRseq_qvals_df = c()
for(i in 1:length(peakList_with_ENCODE)){
df_captured = data.frame("Sample" = sampleList_all_with_ENCODE[i], "-log10(qval)" = STARRseq_overlapping_peaks[[i]]$V12, "Captured" = "Captured", check.names = FALSE)
df_missed = data.frame("Sample" = sampleList_all_with_ENCODE[i], "-log10(qval)" = STARRseq_missed_peaks[[i]]$V12, "Captured" = "Missed", check.names = FALSE)
STARRseq_qvals_df = rbind(STARRseq_qvals_df, rbind(df_captured, df_missed))
}STARRseq_qvals_df_melt = melt(STARRseq_qvals_df, id.vars = c("Sample", "Captured"))ggplot(STARRseq_qvals_df_melt, aes(x = value, y = Sample, fill = Captured)) +
theme_bw() +
stat_boxplot(geom = "errorbar", coef = 1.5) +
geom_boxplot(outlier.shape = NA) +
coord_cartesian(xlim = c(1, 10)) +
scale_x_continuous(breaks = seq(1, 10, by = 1)) +
ylab("") +
xlab("-log10(qval)") +
labs(fill = "STARR-seq peaks") +
theme(legend.position = "bottom") +
theme(text = element_text(size = 15))tests = c()
for(i in 1:length(sampleList_all_with_ENCODE)){
captured = STARRseq_qvals_df[(STARRseq_qvals_df$Sample == sampleList_all_with_ENCODE[i] & STARRseq_qvals_df$Captured == "Captured"),]
missed = STARRseq_qvals_df[(STARRseq_qvals_df$Sample == sampleList_all_with_ENCODE[i] & STARRseq_qvals_df$Captured == "Missed"),]
var_test = var.test(captured$`-log10(qval)`, missed$`-log10(qval)`)
var = ifelse(var_test$p.value < 0.05, FALSE, TRUE)
test = t.test(captured$`-log10(qval)`, missed$`-log10(qval)`, var.equal = var, paired = FALSE, alternative = c("two.sided"))
tests[[i]] = test
}
names(tests) = sampleList_all_with_ENCODE
tests## $Diagenode_C15410196_1_50_SEACR
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 18.18, df = 2011, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.8831205 1.0966932
## sample estimates:
## mean of x mean of y
## 3.239952 2.250045
##
##
## $Diagenode_C15410196_1_50_MACS2
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 19.091, df = 2155.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.8915945 1.0957315
## sample estimates:
## mean of x mean of y
## 3.237487 2.243824
##
##
## $CST9733_1_100_H3K27me3_SEACR
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -9.0224, df = 1729.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3584189 -0.2304155
## sample estimates:
## mean of x mean of y
## 2.065323 2.359740
##
##
## $CST9733_1_100_H3K27me3_MACS2
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -9.0034, df = 1838.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3551120 -0.2280743
## sample estimates:
## mean of x mean of y
## 2.069078 2.360671
##
##
## $ENCODE_H3K27ac
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 24.894, df = 5019.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.775484 0.908063
## sample estimates:
## mean of x mean of y
## 3.009353 2.167580
##
##
## $ENCODE_H3K27me3
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -5.2025, df = 9194.3, p-value = 2.008e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.16923144 -0.07660483
## sample estimates:
## mean of x mean of y
## 2.248104 2.371022
STARRseq_filt_qvals_df = c()
STARRseq_filt_overlapping_peaks_cut = STARRseq_filt_overlapping_peaks[names(STARRseq_filt_overlapping_peaks) != "ENCODE DNase"]
STARRseq_filt_missed_peaks_cut = STARRseq_filt_missed_peaks[names(STARRseq_filt_missed_peaks) != "ENCODE DNase"]
for(i in 1:length(STARRseq_filt_overlapping_peaks_cut)){
df_captured = data.frame("Sample" = names(STARRseq_filt_overlapping_peaks_cut)[i], "-log10(qval)" = STARRseq_filt_overlapping_peaks_cut[[i]]$V12, "Captured" = "Captured", check.names = FALSE)
df_missed = data.frame("Sample" = names(STARRseq_filt_missed_peaks_cut)[i], "-log10(qval)" = STARRseq_filt_missed_peaks_cut[[i]]$V12, "Captured" = "Missed", check.names = FALSE)
STARRseq_filt_qvals_df = rbind(STARRseq_filt_qvals_df, rbind(df_captured, df_missed))
}STARRseq_filt_qvals_df_melt = melt(STARRseq_filt_qvals_df, id.vars = c("Sample", "Captured"))ggplot(STARRseq_filt_qvals_df_melt, aes(x = value, y = Sample, fill = Captured)) +
theme_bw() +
stat_boxplot(geom = "errorbar", coef = 1.5) +
geom_boxplot(outlier.shape = NA) +
coord_cartesian(xlim = c(1, 10)) +
scale_x_continuous(breaks = seq(1, 10, by = 1)) +
scale_y_discrete(limits = rev(levels(STARRseq_filt_qvals_df_melt$Sample))) +
ylab("") +
xlab("-log10(qval)") +
labs(fill = "STARR-seq peaks") +
theme(legend.position = "bottom") +
theme(text = element_text(size = 18))tests = c()
for(i in 1:length(STARRseq_filt_overlapping_peaks_cut)){
captured = STARRseq_filt_qvals_df[(STARRseq_filt_qvals_df$Sample == names(STARRseq_filt_overlapping_peaks_cut)[i] & STARRseq_filt_qvals_df$Captured == "Captured"),]
missed = STARRseq_filt_qvals_df[(STARRseq_filt_qvals_df$Sample == names(STARRseq_filt_overlapping_peaks_cut)[i] & STARRseq_filt_qvals_df$Captured == "Missed"),]
var_test = var.test(captured$`-log10(qval)`, missed$`-log10(qval)`)
var = ifelse(var_test$p.value < 0.05, FALSE, TRUE)
test = t.test(captured$`-log10(qval)`, missed$`-log10(qval)`, var.equal = var)
tests[[i]] = test
}
names(tests) = names(STARRseq_filt_overlapping_peaks_cut)
tests## $Diagenode_C15410196_1_50_SEACR
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 11.289, df = 2385.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.5626159 0.7991626
## sample estimates:
## mean of x mean of y
## 3.303851 2.622962
##
##
## $Diagenode_C15410196_1_50_MACS2
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 11.857, df = 2595.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.5765521 0.8050417
## sample estimates:
## mean of x mean of y
## 3.301657 2.610861
##
##
## $CST9733_1_100_H3K27me3_SEACR
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -4.6656, df = 441.88, p-value = 4.084e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5336144 -0.2172999
## sample estimates:
## mean of x mean of y
## 2.427459 2.802916
##
##
## $CST9733_1_100_H3K27me3_MACS2
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -4.1878, df = 501.5, p-value = 3.328e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.4947802 -0.1787796
## sample estimates:
## mean of x mean of y
## 2.466447 2.803227
##
##
## $ENCODE_H3K27ac
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = 13.588, df = 7322.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.4956927 0.6628267
## sample estimates:
## mean of x mean of y
## 3.051485 2.472225
##
##
## $ENCODE_H3K27me3
##
## Welch Two Sample t-test
##
## data: captured$`-log10(qval)` and missed$`-log10(qval)`
## t = -3.7013, df = 3652.9, p-value = 0.0002177
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.27563262 -0.08473987
## sample estimates:
## mean of x mean of y
## 2.651407 2.831593
Randomly sample 6000 peaks from each sample and compare overlaps:
peakList_sub = c()
for(i in 1:length(peakList_with_ENCODE)){
peakList_sub[[i]] = peakList_with_ENCODE[[i]][sample(length(peakList_with_ENCODE[[i]]), 6000), ]
}
names(peakList_sub) = names(peakList_with_ENCODE)
sub_STARRseq_peak_overlaps_df = EpiCompare::overlap_percent(STARRseq, peakList_sub)
sub_STARRseq_peak_overlaps_df["Percentage (filt)"] = (EpiCompare::overlap_percent(STARRseq_filt, peakList_sub))$Percentage %>% round(3)
sub_STARRseq_peak_overlaps_df["Mb"] = lapply(peakList_sub, function(x){sum(GenomicRanges::width(x))/10^6}) %>% unlist() %>% round(2)
sub_STARRseq_peak_overlaps_dfKeep genome coverage the same:
test_coverage = function(peaks){sum(GenomicRanges::width(peaks)/10^6) %>% unlist() %>% round(0)}
peakList_sub_cov = c()
range_start = 500
# minimum coverage is 8Mb if also using ENCODE DNase peaks in comparison, otherwise can set higher limit
Mb = 15
for(i in 1:length(peakList_with_ENCODE)){
#print(paste("Peak", i))
# if included, use all DNase peaks
if(names(peakList_with_ENCODE)[i] == "ENCODE DNase"){
range = length(peakList_with_ENCODE$`ENCODE DNase`)
} else {
range_start = ((Mb - 2)/test_coverage(peakList_with_ENCODE[[i]]) * length(peakList_with_ENCODE[[i]])) %>% round(0)
range = range_start:length(peakList_with_ENCODE[[i]])
}
# test lower bound of range
test_peaks = peakList_with_ENCODE[[i]][sample(length(peakList_with_ENCODE[[i]]), range[1]), ]
test_cov = test_peaks %>% test_coverage()
if(test_cov == Mb){
peakList_sub_cov[[i]] = test_peaks
}
if(test_cov < Mb){
for(x in range){
sub_peaks = peakList_with_ENCODE[[i]][sample(length(peakList_with_ENCODE[[i]]), x), ]
coverage = test_coverage(sub_peaks)
if(coverage == Mb){
peakList_sub_cov[[i]] = sub_peaks
break
}
}
}
if(test_cov > Mb){
print(paste("Adjusting range for peak", i))
range_start = range_start - 500
range = range_start:length(peakList_with_ENCODE[[i]])
}
}
names(peakList_sub_cov) = names(peakList_with_ENCODE)
sub_cov_STARRseq_peak_overlaps_df = EpiCompare::overlap_percent(STARRseq, peakList_sub_cov)
sub_cov_STARRseq_peak_overlaps_df["Percentage (filt)"] = (EpiCompare::overlap_percent(STARRseq_filt, peakList_sub_cov))$Percentage %>% round(3)
sub_cov_STARRseq_peak_overlaps_df["Mb"] = lapply(peakList_sub_cov, function(x){sum(GenomicRanges::width(reduce(x)))/10^6}) %>% unlist() %>% round(2)
sub_cov_STARRseq_peak_overlaps_dfSTARRseq_peak_overlaps_df_cut = STARRseq_peak_overlaps_df[STARRseq_peak_overlaps_df$Sample %in% rownames(sub_STARRseq_peak_overlaps_df), ]
STARRseq_peak_overlaps_dfSTARRseq_peak_overlaps_df_cutsub_STARRseq_peak_overlaps_dfsub_cov_STARRseq_peak_overlaps_dfsub_STARRseq_peak_overlaps_df = sub_STARRseq_peak_overlaps_df[match(STARRseq_peak_overlaps_df_cut$Sample, rownames(sub_STARRseq_peak_overlaps_df)),]
sub_cov_STARRseq_peak_overlaps_df = sub_cov_STARRseq_peak_overlaps_df[match(STARRseq_peak_overlaps_df_cut$Sample, rownames(sub_cov_STARRseq_peak_overlaps_df)),]
STARRseq_capture_summary = data.frame("Sample" = STARRseq_peak_overlaps_df_cut$Sample, "Uncorrected" = STARRseq_peak_overlaps_df_cut$`% captured STARR-seq peaks (all)`, "Corrected for genomic coverage" = sub_cov_STARRseq_peak_overlaps_df$Percentage, check.names = FALSE)
STARRseq_capture_summary_filt = data.frame("Sample" = STARRseq_peak_overlaps_df_cut$Sample, "Uncorrected" = STARRseq_peak_overlaps_df_cut$`% captured STARR-seq peaks (filt)`, "Corrected for genomic coverage" = sub_cov_STARRseq_peak_overlaps_df$`Percentage (filt)`, check.names = FALSE)
STARRseq_capture_summarySTARRseq_capture_summary_filtSTARRseq_capture_summary_melt = melt(STARRseq_capture_summary)## Using Sample as id variables
STARRseq_capture_summary_melt$Sample = factor(STARRseq_capture_summary_melt$Sample, levels = unique(STARRseq_capture_summary_melt$Sample))
ggplot(STARRseq_capture_summary_melt, aes(y = Sample, x = value, fill = variable)) +
theme_bw() +
geom_bar(stat = "identity", width = 0.75, position = position_dodge(width = 0.75), color = "black", size = 0.2) +
scale_y_discrete(limits = rev(levels(STARRseq_capture_summary_melt$Sample))) +
ylab("") +
#xlim(0,40) +
xlab("% of STARR-seq peaks captured") +
theme(legend.position = "bottom", legend.title = element_blank()) +
guides(fill = guide_legend(ncol = 2, reverse = FALSE)) +
theme(text = element_text(size = 12)) STARRseq_capture_summary_filt_melt = melt(STARRseq_capture_summary_filt)## Using Sample as id variables
STARRseq_capture_summary_filt_melt$Sample = factor(STARRseq_capture_summary_filt_melt$Sample, levels = unique(STARRseq_capture_summary_filt_melt$Sample))
ggplot(STARRseq_capture_summary_filt_melt, aes(y = Sample, x = value, fill = variable)) +
theme_bw() +
geom_bar(stat = "identity", width = 0.75, position = position_dodge(width = 0.75), color = "black", size = 0.2) +
scale_y_discrete(limits = rev(levels(STARRseq_capture_summary_filt_melt$Sample))) +
ylab("") +
#xlim(0,40) +
xlab("% of STARR-seq peaks captured") +
theme(legend.position = "bottom", legend.title = element_blank()) +
guides(fill = guide_legend(ncol = 2, reverse = FALSE)) +
theme(text = element_text(size = 12)) sessionInfo()## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] DiffBind_3.6.2
## [2] SummarizedExperiment_1.26.1
## [3] MatrixGenerics_1.8.0
## [4] matrixStats_0.62.0
## [5] marge_0.0.4.9999
## [6] genomation_1.28.0
## [7] org.Hs.eg.db_3.15.0
## [8] BSgenome.Hsapiens.UCSC.hg19_1.4.3
## [9] BSgenome_1.64.0
## [10] Biostrings_2.64.0
## [11] XVector_0.36.0
## [12] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [13] GenomicFeatures_1.48.3
## [14] AnnotationDbi_1.58.0
## [15] Biobase_2.56.0
## [16] BRGenomics_1.8.0
## [17] clusterProfiler_4.4.2
## [18] rtracklayer_1.56.0
## [19] ReactomePA_1.40.0
## [20] ChIPseeker_1.32.0
## [21] chromVAR_1.18.0
## [22] GenomicRanges_1.48.0
## [23] GenomeInfoDb_1.32.2
## [24] IRanges_2.30.0
## [25] S4Vectors_0.34.0
## [26] BiocGenerics_0.42.0
## [27] corrplot_0.92
## [28] viridis_0.6.2
## [29] viridisLite_0.4.0
## [30] gplots_3.1.3
## [31] ggbreak_0.1.0
## [32] ggpubr_0.4.0
## [33] ggplot2_3.3.6
## [34] reshape2_1.4.4
## [35] stringr_1.4.0
## [36] dplyr_1.0.9
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 R.methodsS3_1.8.1
## [3] coda_0.19-4 tidyr_1.2.0
## [5] bit64_4.0.5 knitr_1.39
## [7] irlba_2.3.5 DelayedArray_0.22.0
## [9] R.utils_2.11.0 hwriter_1.3.2.1
## [11] data.table_1.14.2 KEGGREST_1.36.0
## [13] TFBSTools_1.34.0 RCurl_1.98-1.7
## [15] generics_0.1.2 preprocessCore_1.58.0
## [17] cowplot_1.1.1 RSQLite_2.2.14
## [19] shadowtext_0.1.2 bit_4.0.4
## [21] tzdb_0.3.0 enrichplot_1.16.1
## [23] xml2_1.3.3 httpuv_1.6.5
## [25] assertthat_0.2.1 DirichletMultinomial_1.38.0
## [27] apeglm_1.18.0 amap_0.8-18
## [29] xfun_0.31 hms_1.1.1
## [31] jquerylib_0.1.4 evaluate_0.15
## [33] promises_1.2.0.1 fansi_1.0.3
## [35] restfulr_0.0.14 progress_1.2.2
## [37] caTools_1.18.2 dbplyr_2.2.0
## [39] igraph_1.3.1 DBI_1.1.2
## [41] geneplotter_1.74.0 htmlwidgets_1.5.4
## [43] purrr_0.3.4 ellipsis_0.3.2
## [45] backports_1.4.1 annotate_1.74.0
## [47] gridBase_0.4-7 biomaRt_2.52.0
## [49] vctrs_0.4.1 abind_1.4-5
## [51] cachem_1.0.6 withr_2.5.0
## [53] ggforce_0.3.3 vroom_1.5.7
## [55] bdsmatrix_1.3-6 GenomicAlignments_1.32.0
## [57] treeio_1.20.0 prettyunits_1.1.1
## [59] DOSE_3.22.0 ape_5.6-2
## [61] lazyeval_0.2.2 seqLogo_1.62.0
## [63] crayon_1.5.1 genefilter_1.78.0
## [65] labeling_0.4.2 pkgconfig_2.0.3
## [67] tweenr_1.0.2 nlme_3.1-157
## [69] rlang_1.0.2 lifecycle_1.0.1
## [71] miniUI_0.1.1.1 downloader_0.4
## [73] filelock_1.0.2 BiocFileCache_2.4.0
## [75] seqPattern_1.28.0 AnnotationHub_3.4.0
## [77] invgamma_1.1 polyclip_1.10-0
## [79] graph_1.74.0 Matrix_1.4-1
## [81] aplot_0.1.6 ashr_2.2-54
## [83] carData_3.0-5 boot_1.3-28
## [85] png_0.1-7 rjson_0.2.21
## [87] bitops_1.0-7 R.oo_1.24.0
## [89] KernSmooth_2.23-20 blob_1.2.3
## [91] mixsqp_0.3-43 SQUAREM_2021.1
## [93] qvalue_2.28.0 ShortRead_1.54.0
## [95] jpeg_0.1-9 readr_2.1.2
## [97] rstatix_0.7.0 gridGraphics_0.5-1
## [99] ggsignif_0.6.3 CNEr_1.32.0
## [101] reactome.db_1.81.0 scales_1.2.0
## [103] memoise_2.0.1 graphite_1.42.0
## [105] magrittr_2.0.3 plyr_1.8.7
## [107] zlibbioc_1.42.0 compiler_4.2.0
## [109] scatterpie_0.1.7 bbmle_1.0.25
## [111] BiocIO_1.6.0 RColorBrewer_1.1-3
## [113] plotrix_3.8-2 DESeq2_1.36.0
## [115] Rsamtools_2.12.0 cli_3.3.0
## [117] systemPipeR_2.2.2 patchwork_1.1.1
## [119] MASS_7.3-57 tidyselect_1.1.2
## [121] stringi_1.7.6 highr_0.9
## [123] emdbook_1.3.12 yaml_2.3.5
## [125] GOSemSim_2.22.0 locfit_1.5-9.5
## [127] latticeExtra_0.6-29 ggrepel_0.9.1
## [129] sass_0.4.1 fastmatch_1.1-3
## [131] tools_4.2.0 parallel_4.2.0
## [133] rstudioapi_0.13 TFMPvalue_0.0.8
## [135] gridExtra_2.3 plyranges_1.16.0
## [137] farver_2.1.0 ggraph_2.0.5
## [139] BiocManager_1.30.18 digest_0.6.29
## [141] shiny_1.7.1 pracma_2.3.8
## [143] Rcpp_1.0.8.3 car_3.0-13
## [145] broom_0.8.0 BiocVersion_3.15.2
## [147] later_1.3.0 httr_1.4.3
## [149] colorspace_2.0-3 XML_3.99-0.9
## [151] truncnorm_1.0-8 splines_4.2.0
## [153] yulab.utils_0.0.4 tidytree_0.3.9
## [155] EpiCompare_1.0.0 graphlayouts_0.8.0
## [157] ggplotify_0.1.0 plotly_4.10.0
## [159] xtable_1.8-4 jsonlite_1.8.0
## [161] ggtree_3.4.0 poweRlaw_0.70.6
## [163] tidygraph_1.2.1 UpSetR_1.4.0
## [165] ggfun_0.0.6 R6_2.5.1
## [167] pillar_1.7.0 htmltools_0.5.2
## [169] mime_0.12 glue_1.6.2
## [171] fastmap_1.1.0 DT_0.23
## [173] BiocParallel_1.30.3 interactiveDisplayBase_1.34.0
## [175] codetools_0.2-18 fgsea_1.22.0
## [177] GreyListChIP_1.28.1 mvtnorm_1.1-3
## [179] utf8_1.2.2 lattice_0.20-45
## [181] bslib_0.3.1 tibble_3.1.7
## [183] numDeriv_2016.8-1.1 curl_4.3.2
## [185] gtools_3.9.2.1 GO.db_3.15.0
## [187] survival_3.3-1 limma_3.52.1
## [189] rmarkdown_2.14 munsell_0.5.0
## [191] DO.db_2.9 GenomeInfoDbData_1.2.8
## [193] impute_1.70.0 gtable_0.3.0